################################################################################
##
## Purpose: This script prepares the final data for NLP scripts.
##
## Author: James Bisbee (james.h.bisbee@vanderbilt.edu)
##
##  - Inputs:
##    - ./data/prepped/finalData_12_25_2021.RData: Prepped data from 1_DATA_merge.R
##    - ./data/prepped/bills/binder_bills_merged.RData: Prepped data from 5_DATA_bill_prep.R
##    - ./data/raw/politicians/Yellen_vote.csv: Raw data from https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=117&session=1&vote=00006
##  - Outputs:
##    - ./data/prepped/finalData_for_NLP.RData
##
##
## See associated log file for compute environment, package versions, 
##  and date of most recent run.
##
################################################################################
# Start from an empty workspace and reclaim memory before the heavy loads.
rm(list = ls())
gc()
# library() stops immediately if tidyverse is not installed; require() would
# only warn and return FALSE, letting the script fail later on the first
# `%>%` with a much less helpful error.
library(tidyverse)

# Fixed seed so any randomised step downstream is reproducible.
set.seed(123)

# Compute details
# Logs the run date and a best-effort summary of the machine the script is
# running on, followed by the R session information (package versions).
print(paste0('Compute environment from ',Sys.Date(),' run by Bisbee'))
os_name <- Sys.info()[['sysname']]
if (os_name == 'Windows') {
  # wmic prints a header line first; [-1] / [2] skip it.
  ram_size <- system("wmic MemoryChip get Capacity", intern = TRUE)[-1]
  model_name <- system("wmic cpu get name", intern = TRUE)[2] # nocov
  vendor_id <- system("wmic cpu get manufacturer", intern = TRUE)[2] # nocov
  
  print(list(ram = stringr::str_squish(ram_size)[1],
             vendor_id = stringr::str_squish(vendor_id),
             model_name = stringr::str_squish(model_name),
             no_of_cores = parallel::detectCores()))
} else if (os_name == 'Linux') {
  # Sys.info() reports 'Linux'; the previous comparison against 'Linuxs'
  # could never be TRUE, so this branch was unreachable.
  ps_lines <- system("ps -C rsession -o %cpu,%mem,pid,cmd", intern = TRUE)
  # Drop the header row, then trim padding so that splitting on runs of
  # whitespace always yields: %cpu, %mem, pid, followed by the command words.
  ps_rows <- trimws(ps_lines[-1])
  ps_rows <- ps_rows[nzchar(ps_rows)]
  if (length(ps_rows) == 0) {
    # e.g. the script is run via Rscript rather than inside an rsession
    cat("No running 'rsession' process found; skipping per-process CPU/memory summary.\n")
  } else {
    df <- do.call(rbind, lapply(strsplit(ps_rows, "\\s+"),
                                function(x) data.frame(
                                  cpu = as.numeric(x[1]),
                                  mem = as.numeric(x[2]),
                                  pid = as.numeric(x[3]),
                                  cmd = paste(x[-(1:3)], collapse = " "))))
    # Explicit print so the table also reaches the log under source().
    print(df)
  }
} else {
  cat("If not on Linux or Windows, you'll have to figure out your own solution to seeing the compute environment.")
}

sessionInfo()


# -------------------------------------------------------------- Loading data
# Main prepped data set (per the file header: output of 1_DATA_merge.R).
# Presumably this is what provides `finalMerge` -- confirm against that script.
load(file = './data/prepped/finalData_12_25_2021.RData')
# Binder bill data (per the file header: output of 5_DATA_bill_prep.R).
# Presumably this is what provides `finalBills` -- confirm against that script.
load(file = './data/prepped/bills/binder_bills_merged.RData')

# Yellen nomination roll call, downloaded from
# https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=117&session=1&vote=00006
# Column names are supplied here, so read_csv treats the first line of the
# file as data rather than as a header.
vote <- './data/raw/politicians/Yellen_vote.csv' %>%
  read_csv(col_names = c('speaker','Party','stab','yellen_vote'))


# Cleaning the data further.
# Adds a row index, normalises whitespace in `textclean`, and records the
# length of the cleaned text. mutate() evaluates its arguments in order, so
# each `textclean = ...` step below sees the result of the previous one.
finalMerge <- finalMerge %>%
  mutate(
    # index of each row at this point in the pipeline
    fullInd = row_number(),
    # 1. remove spans that start at a line feed followed by 3+ whitespace
    #    characters and 2+ upper-case letters and run through a carriage
    #    return (presumably embedded headings -- confirm against raw text)
    textclean = gsub('\\\n\\s{3,}[[:upper:]]{2,}.*\\\r','',textclean),
    # 2. turn any remaining carriage returns / line feeds into spaces
    textclean = gsub('\\\r|\\\n',' ',textclean),
    # 3. collapse runs of two or more whitespace characters to one space
    textclean = gsub('\\s{2,}',' ',textclean),
    # 4. strip leading and trailing whitespace
    textclean = trimws(textclean),
    # character count of the cleaned text
    nchars = nchar(textclean)
  )

# Filling in missing demographic data
# Diagnostic (printed, not stored): identified speakers that are missing at
# least one of the listed demographic fields.
finalMerge %>%
  filter(!is.na(opensecretsID)) %>%
  select(opensecretsID,gender,party,nominate_dim1,age,seniority) %>%
  filter(!complete.cases(.)) %>%
  distinct()

# Where `party` is missing, fall back to the value stored in `position`.
finalMerge$party[is.na(finalMerge$party)] <- finalMerge$position[is.na(finalMerge$party)]

# Hard-code gender for the four FED* ids, then drop the 'ADMIN' speaker.
# NOTE(review): `!=` evaluates to NA for a missing opensecretsID and filter()
# drops NA rows, so rows without an id are removed here as well -- confirm
# that this is intended.
finalMerge <- finalMerge %>%
  mutate(gender = ifelse(opensecretsID == 'FEDYELLEN','F',
                         ifelse(opensecretsID %in% c('FEDGREENSPAN',
                                                     'FEDBERNANKE',
                                                     'FEDPOWELL'),'M',gender))) %>%
  filter(opensecretsID != 'ADMIN')

# Diagnostic (printed, not stored): identified speakers still missing any of
# the state, demographic, ideology, vote-share or family fields.
finalMerge %>%
  filter(!is.na(opensecretsID)) %>%
  select(opensecretsID,stab,gender,party,nominate_dim1,
         votepct_rel,votepct,nKids,nDaughters,nSons,firstDaughter) %>%
  filter(!complete.cases(.)) %>%
  distinct()



# Hard-coded vote shares and state:
#  - ids containing 'FED' or 'EXPERT' get votepct = votepct_rel = 1 and
#    stab = 'DC';
#  - N00042619 gets votepct = .549 and votepct_rel = .549-.44
#    (source cited in the original script:
#    https://ballotpedia.org/Michael_F.Q._San_Nicolas).
# The two id conditions cannot both hold, so their order does not matter.
finalMerge <- finalMerge %>%
  mutate(
    votepct = ifelse(grepl('FED|EXPERT',opensecretsID),1,
                     ifelse(opensecretsID == 'N00042619',.549,votepct)),
    votepct_rel = ifelse(grepl('FED|EXPERT',opensecretsID),1,
                         ifelse(opensecretsID == 'N00042619',.549-.44,votepct_rel)),
    stab = ifelse(grepl('FED|EXPERT',opensecretsID),'DC',stab)
  )

# Hard-coded family covariates for the Fed chairs and expert witnesses.
# The values live in one lookup table (one row per person) instead of four
# parallel ten-deep ifelse() ladders, so each person's numbers can be read and
# checked on a single line.
#  - ids listed in the table receive the values shown;
#  - any other id containing 'EXPERT' receives 0 for all four variables;
#  - every remaining row keeps its existing value.
# local() keeps the helper objects out of the global workspace.
finalMerge <- local({
  known_families <- tribble(
    ~opensecretsID,    ~nKids, ~nDaughters, ~nSons, ~firstDaughter,
    'FEDPOWELL',            3,           2,      1,              1,
    'FEDBERNANKE',          2,           0,      2,              0,
    'FEDGREENSPAN',         0,           0,      0,              0,
    'FEDYELLEN',            1,           0,      1,              0,
    'EXPERTKOHN',           2,           1,      1,              1,
    'EXPERTKOO',            2,           0,      2,              0,
    'EXPERTMELTZER',        2,           0,      2,              0,
    'EXPERTTAYLOR',         2,           1,      1,              0,
    'EXPERTMCCLOSKEY',      2,           1,      1,              0
  )
  family_vars <- setdiff(names(known_families), 'opensecretsID')

  out <- finalMerge
  # Row of the lookup table for each observation (NA when the id is not listed).
  table_row <- match(out$opensecretsID, known_families$opensecretsID)
  in_table <- !is.na(table_row)
  # Experts without an explicit entry default to zero.
  other_expert <- !in_table & grepl('EXPERT', out$opensecretsID)

  for (v in family_vars) {
    out[[v]] <- ifelse(in_table, known_families[[v]][table_row],
                       ifelse(other_expert, 0, out[[v]]))
  }
  out
})




# -------------------------------------------------------------- Merging Step 1
# Merging with the bills
# The join is a natural join: no `by` is given, so dplyr matches on every
# column name the two tables share and reports the keys in a message.
# It is run once and the result reused for the diagnostic below; previously
# the identical join over the full data was computed twice.
finalMerge <- finalMerge %>%
  left_join(finalBills)

# Diagnostic (printed, not stored): rows that have complete bill information,
# i.e. no NA in opensecretsID or in any column whose name matches '_tot|Bill'.
finalMerge %>%
  select(opensecretsID,matches('_tot|Bill')) %>%
  drop_na()


# Finally merging with the vote records of who voted against Yellen
# Step 1: build a one-row-per-senator table of the indicator.
#  - take the distinct (id, speaker, party, state) combinations of Senate
#    speakers in 2014;
#  - strip titles and periods from `speaker` so it can match the roll-call
#    file, which is joined on the shared columns (speaker, party, stab);
#  - yellen_vote = 1 when the matched roll-call entry is 'NAY' or 'NV',
#    0 otherwise (including senators with no match in the roll-call file).
# distinct() runs before the titles are stripped, so a senator who appears
# under several name variants still has several rows at this point. The table
# is therefore collapsed to a single row per opensecretsID (1 if any variant
# matched a NAY/NV entry); without this, the join in step 2 would duplicate
# every row of that senator in finalMerge.
yellen_by_senator <- finalMerge %>%
  filter(chamber == "Senate",
         year == 2014) %>%
  select(opensecretsID,speaker,party,stab,chamber) %>%
  distinct() %>%
  mutate(speaker = gsub('Chairman |Chairwoman |Senator |\\.','',speaker)) %>%
  left_join(vote %>% mutate(party = Party)) %>%
  mutate(yellen_vote = ifelse(yellen_vote %in% c('NAY','NV'),1,0)) %>%
  group_by(opensecretsID) %>%
  summarise(yellen_vote = max(yellen_vote)) %>%
  ungroup()

# Step 2: attach the indicator to every row; speakers that are not in the
# table built above get 0.
finalMerge <- finalMerge %>%
  left_join(yellen_by_senator) %>%
  mutate(yellen_vote = ifelse(is.na(yellen_vote),0,yellen_vote))


# Write the finished data set to disk; it is the input for the toxicity prep
# and topic model scripts.
save(list = 'finalMerge',
     file = './data/prepped/finalData_for_NLP.RData')

# EOF